Task 1

Part a)

We are trying to understand if some combination of odds predicts over/under values well. First, we read the matches and odd_details datasets.

## Classes 'data.table' and 'data.frame':   3129 obs. of  10 variables:
##  $ matchId  : chr  "KjF6FiA6" "ILVbJgQm" "SGIEDVvJ" "YwL5xFHJ" ...
##  $ home     : chr  "tottenham" "aston villa" "wolves" "bolton" ...
##  $ away     : chr  "manchester city" "west ham" "stoke city" "fulham" ...
##  $ homescore: num  0 3 2 0 0 2 1 6 1 3 ...
##  $ awayscore: num  0 0 1 0 4 2 0 0 1 0 ...
##  $ isover   : logi  FALSE TRUE TRUE FALSE TRUE TRUE ...
##  $ is1      : logi  FALSE TRUE TRUE FALSE FALSE FALSE ...
##  $ is2      : logi  FALSE FALSE FALSE FALSE TRUE FALSE ...
##  $ isX      : logi  TRUE FALSE FALSE TRUE FALSE TRUE ...
##  $ isbts    : logi  FALSE FALSE TRUE FALSE FALSE TRUE ...
##  - attr(*, ".internal.selfref")=<externalptr>
## Classes 'data.table' and 'data.frame':   5675665 obs. of  7 variables:
##  $ matchId      : chr  "004f4ING" "004f4ING" "004f4ING" "004f4ING" ...
##  $ betType      : chr  "1x2" "1x2" "1x2" "1x2" ...
##  $ oddtype      : chr  "odd1" "odd1" "odd1" "odd1" ...
##  $ bookmaker    : chr  "10Bet" "10Bet" "12BET" "12BET" ...
##  $ date         : num  1.42e+09 1.42e+09 1.42e+09 1.42e+09 1.42e+09 ...
##  $ odd          : num  1.67 1.65 1.67 1.65 1.7 1.67 1.68 1.61 1.62 1.67 ...
##  $ totalhandicap: chr  NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr>

We will need to transform the matches data set into a format which shows us actual over/under outcomes: (and 1x2 outcomes, for Part b)

# Derive match outcomes from the raw matches table (chained data.table
# by-reference updates):
#   1. split the "home:away" score string into homescore/awayscore,
#   2. coerce both to numeric,
#   3. drop columns not needed downstream (date, leagueId, type, score),
#   4. add logical outcome columns:
#        isover - total goals exceed handicaplvl (file-level constant),
#        is1/is2/isX - home win / away win / draw,
#        isbts - both teams scored.
# na.omit() then drops unplayed games, whose scores parse to NA.
matches <- na.omit(matches[, `:=`(c("homescore", "awayscore"), 
    tstrsplit(score, ":"))][, `:=`(c("homescore", "awayscore"), 
    list(as.numeric(homescore), as.numeric(awayscore)))][, `:=`(c("date", 
    "leagueId", "type", "score"), NULL)][, `:=`(c("isover", "is1", 
    "is2", "isX", "isbts"), list(homescore + awayscore > handicaplvl, 
    homescore > awayscore, homescore < awayscore, homescore == 
        awayscore, homescore > 0 & awayscore > 0))])

The odd_details data set contains the odds we will use to perform PCA - however there are some non-unique odd types. For example, both the “over 2.5” and “over 0.5” odds have the “over” odd type. To make these unique, we will concatenate them with the handicap levels:

# Make odd types unique before widening, since e.g. "over 2.5" and
# "over 0.5" both carry oddtype "over":
#   ha -> prefix with the bet type,
#   ou -> suffix with the handicap level (e.g. "over2.5"),
#   ah -> prefix with the bet type AND suffix with the handicap.
# Then keep only the most recent quote per (matchId, bookmaker, oddtype)
# and drop the helper columns. copy() protects the raw odd_details table
# from :='s by-reference updates.
odd_details_latest <- (copy(odd_details)[betType == "ha", `:=`(oddtype, 
    paste(betType, oddtype, sep = ""))][betType == "ou", `:=`(oddtype, 
    paste(oddtype, totalhandicap, sep = ""))][betType == "ah", 
    `:=`(oddtype, paste(betType, oddtype, totalhandicap, sep = ""))][, 
    `:=`(maxdate, max(date)), by = list(matchId, bookmaker, oddtype)][date == 
    maxdate][, `:=`(c("maxdate", "date", "betType", "totalhandicap"), 
    NULL)])
# widen: one row per (matchId, bookmaker), one column per odd type
odd_details_wide <- dcast(copy(odd_details_latest), matchId + 
    bookmaker ~ oddtype, value.var = "odd")

Let’s view a small part of our wide table:

## Classes 'data.table' and 'data.frame':   10 obs. of  10 variables:
##  $ matchId  : chr  "004f4ING" "004f4ING" "004f4ING" "004f4ING" ...
##  $ bookmaker: chr  "10Bet" "12BET" "188BET" "1xBet" ...
##  $ 12       : num  1.22 NA NA 1.28 1.25 1.25 1.25 1.24 1.27 1.22
##  $ 1X       : num  1.12 NA NA 1.15 1.14 1.12 1.12 1.12 1.12 1.12
##  $ NO       : num  1.69 NA NA 1.72 1.78 NA 1.72 1.72 1.65 1.67
##  $ X2       : num  2.15 NA NA 2.4 2.25 2.2 2.26 2.25 2.3 2.2
##  $ YES      : num  2.05 NA NA 2.12 2.05 NA 2.02 2.02 2.1 2.05
##  $ ah1+0.5  : num  NA NA NA NA NA NA NA NA NA NA
##  $ ah1+1    : num  NA NA NA 1.04 NA NA NA NA NA NA
##  $ ah1+1.5  : num  NA NA NA 1.02 NA NA NA NA NA NA
##  - attr(*, "sorted")= chr  "matchId" "bookmaker"
##  - attr(*, ".internal.selfref")=<externalptr>
##      matchId bookmaker   12   1X   NO   X2  YES ah1+0.5 ah1+1 ah1+1.5
##  1: 004f4ING     10Bet 1.22 1.12 1.69 2.15 2.05      NA    NA      NA
##  2: 004f4ING     12BET   NA   NA   NA   NA   NA      NA    NA      NA
##  3: 004f4ING    188BET   NA   NA   NA   NA   NA      NA    NA      NA
##  4: 004f4ING     1xBet 1.28 1.15 1.72 2.40 2.12      NA  1.04    1.02
##  5: 004f4ING BetVictor 1.25 1.14 1.78 2.25 2.05      NA    NA      NA
##  6: 004f4ING   Betclic 1.25 1.12   NA 2.20   NA      NA    NA      NA
##  7: 004f4ING   Betsafe 1.25 1.12 1.72 2.26 2.02      NA    NA      NA
##  8: 004f4ING   Betsson 1.24 1.12 1.72 2.25 2.02      NA    NA      NA
##  9: 004f4ING    Betway 1.27 1.12 1.65 2.30 2.10      NA    NA      NA
## 10: 004f4ING    ComeOn 1.22 1.12 1.67 2.20 2.05      NA    NA      NA

AH (Asian Handicap) data introduces a lot of missing values, we might consider removing them and then filtering other columns which introduce missing values:

# Drop the sparse Asian-handicap ("ah...") columns. NOTE: the original
# line mutated odd_details_wide itself (:= updates by reference); copy()
# leaves the wide table intact, matching the appendix version of this
# step.
odd_details_wide_noah <- copy(odd_details_wide)[, `:=`(c(grep("ah", 
    colnames(odd_details_wide), fixed = TRUE)), NULL)]
# count NAs per bookmaker, totalled across the odd-type columns
na_count_by_bm <- copy(odd_details_wide_noah)[, lapply(.SD, function(x) sum(is.na(x))), 
    by = bookmaker][, `:=`(total, rowSums(.SD[, 3:length(.SD)]))]
# order by NA total, most NAs first; the 5 bookmakers at the tail are
# the most complete ones and only those are kept
bad_bookmakers <- na_count_by_bm[, total, by = bookmaker][order(-total)]
keep_bookmakers <- bad_bookmakers$bookmaker[(length(bad_bookmakers$bookmaker) - 
    4):length(bad_bookmakers$bookmaker)]
odd_details_wide_noah <- odd_details_wide_noah[bookmaker %in% 
    keep_bookmakers]

# Recursively drop the odd-type column holding the most NAs for each
# bookmaker, until either fewer than 9 odd-type columns remain
# (length(wide_table) < 11 also counts matchId and bookmaker) or the
# previous deletion became inefficient: kgf (keep-going factor) is the
# ratio of NAs removed by the last pass to the maximum NAs that deleting
# a single match could remove.
#
# wide_table: wide odds table (matchId, bookmaker, one column per odd type)
# kgf:        keep-going factor from the previous pass (100 on first call)
# returns:    the filtered wide table
filter.oddtypes <- function(wide_table, kgf) {
    # scalar condition: use the short-circuiting || (the original's
    # elementwise | works here but is not idiomatic for if())
    if ((length(wide_table) < 11) || kgf < 20) {
        return(wide_table)
    }
    # NA count per bookmaker per column, plus a per-bookmaker total
    na_count <- copy(wide_table)[, lapply(.SD, function(x) sum(is.na(x))), 
        by = bookmaker][, `:=`(total, rowSums(.SD[, 3:length(.SD)]))]
    sum1 <- sum(na_count$total)
    # each bookmaker's worst (most-NA) column; the union is deleted
    bad_oddtypes <- unique(na_count[, colnames(.SD[, 3:(length(.SD) - 
        1)])[max.col(na_count[, 3:(length(.SD) - 1)], ties.method = "first")]])
    print(bad_oddtypes)
    na_count <- na_count[, `:=`(c(bad_oddtypes), NULL)][, `:=`(total, 
        rowSums(.SD[, 3:(length(.SD) - 1)]))]
    sum2 <- sum(na_count$total)
    removed <- sum1 - sum2
    # worst-case NA removal achievable by dropping one match instead
    rowremposs <- max(wide_table[, .N, by = matchId]$N) * (length(wide_table) - 
        2)
    print(paste("The removed number of NA entries is", removed, 
        "deleting a match instead could remove a maximum of", 
        rowremposs, "NA values."))
    keepgoingfactor <- removed/rowremposs
    wide_table <- wide_table[, `:=`(c(bad_oddtypes), NULL)]
    return(filter.oddtypes(wide_table, keepgoingfactor))
}

# iteratively drop the sparsest odd-type columns; the initial kgf of
# 100 guarantees at least one pass (recursion stops when kgf < 20)
odd_details_wide_filtered <- filter.oddtypes(copy(odd_details_wide_noah), 
    100)

Functions to do PCA and MDS analyses:

# PCA of one bookmaker's odds, plotted against a match outcome.
#
# wide_filtered_table: wide odds table (matchId, bookmaker, odd columns)
# match.tbl:           matches table holding the logical outcome columns
# bmname:              bookmaker whose rows are analysed
# condition:           outcome column name (e.g. "isover") colouring the plot
# suppress:            if TRUE, skip the scree plot and printed summaries
# returns: list(merged scores data.frame, princomp object)
pca.analysis <- function(wide_filtered_table, match.tbl, bmname, 
    condition, suppress = FALSE) {
    # one bookmaker's rows, complete cases only, bookmaker column dropped
    df <- as.data.frame(na.omit(copy(wide_filtered_table)[bookmaker == 
        bmname][, `:=`(bookmaker, NULL)]))
    # scale each odd column by its maximum (no centering) so features
    # share a comparable range before PCA
    df[, -1] <- lapply(df[, -1], function(x) scale(x, center = FALSE, 
        scale = max(x)))
    rownames(df) <- df[["matchId"]]
    pca <- princomp(df[, -1])
    if (!suppress) {
        plot(pca, main = paste("PCA for", bmname))
        print(paste("PCA for", bmname))
        print(summary(pca))
        print(pca$loadings)
    }
    mscores <- as.data.frame(pca$scores)
    mscores[["matchId"]] <- rownames(mscores)
    # attach the outcome column (..condition is data.table's "variable
    # from the calling scope" syntax)
    pca_m <- unique(merge(mscores, match.tbl[, c("matchId", ..condition)], 
        by = "matchId"))
    # first two principal components, outcome == TRUE in red
    plot(pca_m[["Comp.1"]], pca_m[["Comp.2"]], col = ifelse(pca_m[[condition]], 
        "red", "black"), xlab = "Comp. 1", ylab = "Comp. 2", 
        main = paste("PCA shifted data for", bmname, "and", condition))
    legend("bottomright", c(condition, paste("NOT", condition)), 
        fill = c("red", "black"), cex = 0.75)
    return(list(pca_m, pca))
}

# Classical MDS of one bookmaker's odds, plotted against a match outcome.
#
# wide_filtered_table: wide odds table (matchId, bookmaker, odd columns)
# match.tbl:           matches table holding the logical outcome columns
# mthd:                distance method passed to dist(), e.g. "euclidean"
# bmname:              bookmaker whose rows are analysed
# condition:           outcome column name colouring the plot
# Called for its plot side effect; returns NULL invisibly.
do.MDS <- function(wide_filtered_table, match.tbl, mthd, bmname, 
    condition) {
    # same per-bookmaker subset and max-scaling as pca.analysis, so both
    # analyses see the same feature space
    df <- as.data.frame(na.omit(copy(wide_filtered_table)[bookmaker == 
        bmname][, `:=`(bookmaker, NULL)]))
    df[, -1] <- lapply(df[, -1], function(x) scale(x, center = FALSE, 
        scale = max(x)))
    rownames(df) <- df[["matchId"]]
    d <- dist(df[, -1], method = mthd)
    # classical (metric) MDS into the default 2 dimensions
    mds <- as.data.frame(cmdscale(d))
    mds[["matchId"]] <- rownames(mds)
    mds_m <- unique(merge(mds, match.tbl[, c("matchId", ..condition)], 
        by = "matchId"))
    plot(mds_m[["V1"]], mds_m[["V2"]], main = paste(bmname, mthd, 
        "Distance MDS for", condition), xlab = "Dim1", ylab = "Dim2", 
        col = ifelse(mds_m[[condition]], "red", "black"))
    legend("bottomright", c(condition, paste("NOT", condition)), 
        fill = c("red", "black"), cex = 0.75)
    invisible(NULL)
}

Do the PCA analysis for 1xBet, one of our selected bookmakers

# PCA for 1xBet, coloured by the over/under outcome
pca1xB_ou <- pca.analysis(odd_details_wide_filtered, matches, 
    "1xBet", "isover")

## [1] "PCA for 1xBet"
## Importance of components:
##                           Comp.1     Comp.2      Comp.3      Comp.4
## Standard deviation     0.2591188 0.05757021 0.019418429 0.013205701
## Proportion of Variance 0.9425271 0.04652556 0.005293272 0.002448043
## Cumulative Proportion  0.9425271 0.98905263 0.994345906 0.996793948
##                             Comp.5       Comp.6       Comp.7       Comp.8
## Standard deviation     0.009926890 0.0079534815 0.0068718105 0.0044006633
## Proportion of Variance 0.001383319 0.0008879951 0.0006628853 0.0002718519
## Cumulative Proportion  0.998177268 0.9990652629 0.9997281481 1.0000000000
## 
## Loadings:
##          Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## over1.5   0.228         0.457         0.318  0.221  0.740  0.185
## over2.5   0.405  0.155        -0.581  0.570 -0.126 -0.358       
## over3.5   0.460  0.360  0.352 -0.170 -0.695        -0.127       
## over4.5   0.512  0.406 -0.536  0.497  0.126         0.103       
## under1.5 -0.341  0.470 -0.452 -0.537 -0.131  0.161  0.356       
## under2.5 -0.318  0.458  0.303  0.134        -0.756              
## under3.5 -0.254  0.410  0.115  0.238  0.194  0.452 -0.398  0.545
## under4.5 -0.167  0.274  0.259  0.152  0.123  0.352        -0.811
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.125  0.125  0.125  0.125  0.125  0.125  0.125  0.125
## Cumulative Var  0.125  0.250  0.375  0.500  0.625  0.750  0.875  1.000

The PCA analysis for the rest of the bookmakers looks like this:

## [1] "PCA for ComeOn"
## Importance of components:
##                           Comp.1     Comp.2      Comp.3       Comp.4
## Standard deviation     0.2973914 0.07296216 0.013761519 0.0090895851
## Proportion of Variance 0.9389338 0.05651627 0.002010531 0.0008771345
## Cumulative Proportion  0.9389338 0.99545003 0.997460566 0.9983377005
##                              Comp.5       Comp.6       Comp.7       Comp.8
## Standard deviation     0.0082676987 0.0073477169 0.0044355514 0.0038157904
## Proportion of Variance 0.0007256837 0.0005731693 0.0002088687 0.0001545778
## Cumulative Proportion  0.9990633842 0.9996365535 0.9998454222 1.0000000000
## 
## Loadings:
##          Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## over1.5   0.229  0.160  0.115  0.325  0.275  0.291  0.725  0.341
## over2.5   0.322  0.214  0.780        -0.414 -0.259              
## over3.5   0.460  0.364        -0.541  0.471  0.193 -0.299  0.110
## over4.5   0.502  0.420 -0.564  0.306 -0.308 -0.157        -0.199
## under1.5 -0.396  0.484 -0.124 -0.491 -0.463  0.169  0.288  0.166
## under2.5 -0.337  0.427         0.106  0.421 -0.711              
## under3.5 -0.275  0.368  0.124  0.501         0.413 -0.526  0.275
## under4.5 -0.183  0.261  0.163         0.211  0.285  0.150 -0.848
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.125  0.125  0.125  0.125  0.125  0.125  0.125  0.125
## Cumulative Var  0.125  0.250  0.375  0.500  0.625  0.750  0.875  1.000
## [1] "PCA for youwin"
## Importance of components:
##                           Comp.1     Comp.2     Comp.3      Comp.4
## Standard deviation     0.2410126 0.05393239 0.01429719 0.010020746
## Proportion of Variance 0.9455020 0.04734588 0.00332724 0.001634493
## Cumulative Proportion  0.9455020 0.99284787 0.99617511 0.997809607
##                              Comp.5       Comp.6       Comp.7       Comp.8
## Standard deviation     0.0074633453 0.0062020250 0.0050104154 0.0039110492
## Proportion of Variance 0.0009066715 0.0006261089 0.0004086301 0.0002489829
## Cumulative Proportion  0.9987162781 0.9993423870 0.9997510171 1.0000000000
## 
## Loadings:
##          Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## over1.5   0.233  0.150  0.394         0.300  0.811         0.118
## over2.5   0.373  0.236  0.408 -0.163  0.450 -0.511  0.319 -0.216
## over3.5   0.445  0.342  0.178 -0.394 -0.641        -0.232  0.182
## over4.5   0.487  0.424 -0.606  0.445  0.117                     
## under1.5 -0.420  0.546 -0.314 -0.572  0.295                     
## under2.5 -0.318  0.401  0.121  0.241 -0.435  0.100  0.607 -0.311
## under3.5 -0.254  0.317  0.268  0.400        -0.249         0.727
## under4.5 -0.171  0.256  0.307  0.275               -0.675 -0.528
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.125  0.125  0.125  0.125  0.125  0.125  0.125  0.125
## Cumulative Var  0.125  0.250  0.375  0.500  0.625  0.750  0.875  1.000
## [1] "PCA for Betfair"
## Importance of components:
##                           Comp.1     Comp.2      Comp.3      Comp.4
## Standard deviation     0.2609809 0.06606928 0.024170156 0.009849291
## Proportion of Variance 0.9290363 0.05954076 0.007968455 0.001323199
## Cumulative Proportion  0.9290363 0.98857702 0.996545476 0.997868676
##                              Comp.5       Comp.6      Comp.7       Comp.8
## Standard deviation     0.0081504617 0.0065419734 0.005326085 0.0043197846
## Proportion of Variance 0.0009061073 0.0005837579 0.000386929 0.0002545302
## Cumulative Proportion  0.9987747828 0.9993585407 0.999745470 1.0000000000
## 
## Loadings:
##          Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## over1.5   0.250         0.270  0.449  0.361  0.697  0.137  0.138
## over2.5   0.398  0.195  0.324  0.506 -0.528 -0.316        -0.236
## over3.5   0.503  0.315  0.453 -0.571  0.254 -0.165         0.120
## over4.5   0.441  0.459 -0.766                                   
## under1.5 -0.342  0.518  0.109 -0.240 -0.542  0.350  0.192  0.303
## under2.5 -0.329  0.442  0.111         0.230        -0.166 -0.771
## under3.5 -0.267  0.335         0.375  0.197 -0.295 -0.580  0.457
## under4.5 -0.189  0.259         0.132  0.374 -0.405  0.748  0.124
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.125  0.125  0.125  0.125  0.125  0.125  0.125  0.125
## Cumulative Var  0.125  0.250  0.375  0.500  0.625  0.750  0.875  1.000
## [1] "PCA for Betfair Exchange"
## Importance of components:
##                           Comp.1     Comp.2      Comp.3      Comp.4
## Standard deviation     0.2397279 0.08584109 0.016023518 0.011871509
## Proportion of Variance 0.8779785 0.11257378 0.003922496 0.002153072
## Cumulative Proportion  0.8779785 0.99055229 0.994474786 0.996627858
##                             Comp.5      Comp.6       Comp.7       Comp.8
## Standard deviation     0.010208065 0.007612929 0.0064106690 0.0041798190
## Proportion of Variance 0.001591965 0.000885422 0.0006278465 0.0002669081
## Cumulative Proportion  0.998219823 0.999105245 0.9997330919 1.0000000000
## 
## Loadings:
##          Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## over1.5   0.237  0.111  0.124         0.538  0.734  0.230  0.175
## over2.5   0.392  0.235  0.384         0.548 -0.546 -0.152 -0.147
## over3.5   0.489  0.336  0.453        -0.638  0.147              
## over4.5   0.497  0.351 -0.789                                   
## under1.5 -0.337  0.555        -0.685                0.300  0.109
## under2.5 -0.316  0.479         0.185         0.287 -0.619 -0.412
## under3.5 -0.247  0.337         0.509        -0.198         0.724
## under4.5 -0.172  0.211         0.471        -0.114  0.667 -0.491
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.125  0.125  0.125  0.125  0.125  0.125  0.125  0.125
## Cumulative Var  0.125  0.250  0.375  0.500  0.625  0.750  0.875  1.000

Over 90% of the total variance is covered by the first principal component for every bookmaker. We can note that each plot of the data on the first two principal components is bow shaped - this indicates that our analysis produced somewhat consistent results. The plots show some small but discernible separation between the red and black dots, although the center of the plot does not seem very decisive for any bookmaker and it would be difficult to use these odds predictively unless they were quite extreme.

In the case of each bookmaker, the eigenvectors of PCA are made up of various over and under odds with handicaps between 1.5 and 4.5. We can surmise that most of the variance in the data comes from such odds, which capture the over/under likelihoods better than they do the 1x2 likelihoods.

Part b and c)

We can run the do.MDS() function shown above on the same datasets to perform an MDS analysis, simply by calling:

# classical MDS on Euclidean distances between 1xBet's odds vectors
do.MDS(odd_details_wide_filtered, matches, "euclidean", "1xBet", 
    "isover")

# the same analysis with Manhattan (L1) distances
do.MDS(odd_details_wide_filtered, matches, "manhattan", "1xBet", 
    "isover")

The rest of our MDS plots are below:

The main finding is that Euclidean MDS produces bow-shaped plots that are very similar to those of PCA, and there does seem to be some separation between the black and red points. It also can be said that the spread of the points looks higher with Euclidean MDS than in PCA. In Manhattan MDS, the distance matrix is no longer guaranteed to be embeddable in a Euclidean space (classical MDS can then yield negative eigenvalues), and we see a divergent pattern that makes it hard to make any predictions. It also must be noted that neither of the MDS methods offers anything like PCA eigenvectors, which can be used to understand where the variance in the data comes from. This (and the higher plot spread) makes MDS look worse as a predictive tool than PCA.

Task 2

In the plots below, “is1”, “is2” and “isX” indicate a home team win, an away team win, and a draw, respectively.

An interesting finding with these plots is that, while the PCA model uses only over and under odds in its eigenvectors, there is some kind of a predictive pattern visible, especially in the “is1” (home team wins) plots. We can furthermore see that a hypothetical data point at the edges of the bow-shaped plot indicates that a draw is not likely. However, it is still difficult to say whether a point in the center of the bow will be a draw.

Task 3

Image structure and dimensions:

str(image)
##  num [1:512, 1:512, 1:3] 0.796 0.796 0.784 0.792 0.816 ...
dim(image)
## [1] 512 512   3

Add jitter to images:

# add uniform noise to every channel, then clamp the intensities back
# into the displayable [0, 1] range
image_noisy <- jitter(image, amount = 0.1)
image_noisy <- pmin(pmax(image_noisy, 0), 1)

Convert a smaller, noisy image to greyscale to do PCA analysis:

dim(image_small)
## [1] 256 256   3
# noise + clamp for the smaller image
image_noisy_small <- jitter(image_small, amount = 0.1)
image_noisy_small <- pmin(pmax(image_noisy_small, 0), 1)
# greyscale = per-pixel mean of the three colour channels
image_noisy_greyscale_small <- (image_noisy_small[, , 1] + 
    image_noisy_small[, , 2] + image_noisy_small[, , 3])/3

Part a)

Divide the image into 3x3 submatrices (patches) and do PCA with “position within patch” (top-right, bottom, etc.) as features and each patch as instances:

plot(pca_patch)

summary(pca_patch)
## Importance of components:
##                           Comp.1     Comp.2     Comp.3     Comp.4
## Standard deviation     0.5442816 0.09822836 0.09044627 0.06012628
## Proportion of Variance 0.9020787 0.02938130 0.02491027 0.01100844
## Cumulative Proportion  0.9020787 0.93146000 0.95637027 0.96737871
##                             Comp.5      Comp.6      Comp.7     Comp.8
## Standard deviation     0.052716089 0.048900599 0.047877230 0.04101685
## Proportion of Variance 0.008462204 0.007281578 0.006979996 0.00512297
## Cumulative Proportion  0.975840910 0.983122488 0.990102484 0.99522545
##                             Comp.9
## Standard deviation     0.039597471
## Proportion of Variance 0.004774545
## Cumulative Proportion  1.000000000
pca_patch$loadings
## 
## Loadings:
##    Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
## 11  0.328  0.514  0.148  0.561                0.435  0.173  0.255
## 21  0.335  0.246  0.368         0.232  0.402 -0.508  0.218 -0.410
## 31  0.332 -0.147  0.542 -0.329  0.364 -0.350  0.151 -0.400  0.170
## 12  0.334  0.395 -0.216        -0.460 -0.309 -0.198 -0.513 -0.277
## 22  0.341               -0.390 -0.441  0.474                0.558
## 32  0.335 -0.393  0.215        -0.452 -0.302  0.206  0.502 -0.311
## 13  0.330  0.146 -0.548 -0.325  0.370 -0.354 -0.134  0.402  0.146
## 23  0.335 -0.244 -0.369         0.225  0.422  0.491 -0.248 -0.398
## 33  0.330 -0.512 -0.142  0.561               -0.435 -0.155  0.270
## 
##                Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8
## SS loadings     1.000  1.000  1.000  1.000  1.000  1.000  1.000  1.000
## Proportion Var  0.111  0.111  0.111  0.111  0.111  0.111  0.111  0.111
## Cumulative Var  0.111  0.222  0.333  0.444  0.556  0.667  0.778  0.889
##                Comp.9
## SS loadings     1.000
## Proportion Var  0.111
## Cumulative Var  1.000

We can see that the first principal component covers over 90 percent of the variance in each patch, and is the scaled sum of each pixel within the patch, with a slight bias for the center pixel.

Part b)

We can see that the first PC is sufficient to meaningfully recreate the image, while the second and third PCs seem to show a bias for certain edges.

Part c)

It can be seen that the first PC focuses on all pixels - but mostly the center, whereas the second and third PCs focus on the bottom-left and bottom-right corners, respectively.

Appendix A - Code For Tasks 1 & 2

# Dependencies: data.table for by-reference tables and dcast; anytime
# for date parsing. library() errors immediately when a package is
# missing, unlike require(), which only returns FALSE.
library(data.table)
library(anytime)

# NOTE(review): a hard-coded setwd() makes the script machine-specific;
# kept so the original run stays reproducible.
setwd("./Desktop/okul/Data Mining/data/")
matches_path <- "df9b1196-e3cf-4cc7-9159-f236fe738215_matches.rds"
odd_details_path <- "df9b1196-e3cf-4cc7-9159-f236fe738215_odd_details.rds"

# over/under goal threshold that defines the `isover` outcome
handicaplvl <- 2.5
matches <- readRDS(matches_path)
# Derive match outcomes: split "home:away" scores, coerce to numeric,
# drop unused columns, then add logical outcome columns
# (isover = total goals > handicaplvl; is1/is2/isX = home win / away
# win / draw; isbts = both teams scored). na.omit() drops unplayed
# games, whose scores parse to NA.
matches <- na.omit(matches[, `:=`(c("homescore", "awayscore"), 
    tstrsplit(score, ":"))][, `:=`(c("homescore", "awayscore"), 
    list(as.numeric(homescore), as.numeric(awayscore)))][, `:=`(c("date", 
    "leagueId", "type", "score"), NULL)][, `:=`(c("isover", "is1", 
    "is2", "isX", "isbts"), list(homescore + awayscore > handicaplvl, 
    homescore > awayscore, homescore < awayscore, homescore == 
        awayscore, homescore > 0 & awayscore > 0))])


odd_details <- readRDS(odd_details_path)
# Make odd types unique per handicap level before widening (ha/ou/ah
# odd types get the bet type and/or handicap folded into the name),
# then keep only the most recent quote per (matchId, bookmaker,
# oddtype) and drop the helper columns.
odd_details_latest <- (copy(odd_details)[betType == "ha", `:=`(oddtype, 
    paste(betType, oddtype, sep = ""))][betType == "ou", `:=`(oddtype, 
    paste(oddtype, totalhandicap, sep = ""))][betType == "ah", 
    `:=`(oddtype, paste(betType, oddtype, totalhandicap, sep = ""))][, 
    `:=`(maxdate, max(date)), by = list(matchId, bookmaker, oddtype)][date == 
    maxdate][, `:=`(c("maxdate", "date", "betType", "totalhandicap"), 
    NULL)])
# widen: one row per (matchId, bookmaker), one column per odd type
odd_details_wide <- dcast(copy(odd_details_latest), matchId + 
    bookmaker ~ oddtype, value.var = "odd")
# drop the sparse Asian-handicap ("ah...") columns
odd_details_wide_noah <- copy(odd_details_wide)[, `:=`(c(grep("ah", 
    colnames(odd_details_wide), fixed = TRUE)), NULL)]
# NA count per bookmaker, totalled across odd-type columns
na_count_by_bm <- copy(odd_details_wide_noah)[, lapply(.SD, function(x) sum(is.na(x))), 
    by = bookmaker][, `:=`(total, rowSums(.SD[, 3:length(.SD)]))]
# most NAs first; the 5 at the tail are the most complete bookmakers
bad_bookmakers <- na_count_by_bm[, total, by = bookmaker][order(-total)]
keep_bookmakers <- bad_bookmakers$bookmaker[(length(bad_bookmakers$bookmaker) - 
    4):length(bad_bookmakers$bookmaker)]
odd_details_wide_noah <- odd_details_wide_noah[bookmaker %in% 
    keep_bookmakers]

# Recursively drop the odd-type column holding the most NAs for each
# bookmaker, until either fewer than 9 odd-type columns remain
# (length(wide_table) < 11 also counts matchId and bookmaker) or the
# previous deletion became inefficient: kgf (keep-going factor) is the
# ratio of NAs removed by the last pass to the maximum NAs that deleting
# a single match could remove.
#
# wide_table: wide odds table (matchId, bookmaker, one column per odd type)
# kgf:        keep-going factor from the previous pass (100 on first call)
# returns:    the filtered wide table
filter.oddtypes <- function(wide_table, kgf) {
    # scalar condition: use the short-circuiting || (the original's
    # elementwise | works here but is not idiomatic for if())
    if ((length(wide_table) < 11) || kgf < 20) {
        return(wide_table)
    }
    # NA count per bookmaker per column, plus a per-bookmaker total
    na_count <- copy(wide_table)[, lapply(.SD, function(x) sum(is.na(x))), 
        by = bookmaker][, `:=`(total, rowSums(.SD[, 3:length(.SD)]))]
    sum1 <- sum(na_count$total)
    # each bookmaker's worst (most-NA) column; the union is deleted
    bad_oddtypes <- unique(na_count[, colnames(.SD[, 3:(length(.SD) - 
        1)])[max.col(na_count[, 3:(length(.SD) - 1)], ties.method = "first")]])
    print(bad_oddtypes)
    na_count <- na_count[, `:=`(c(bad_oddtypes), NULL)][, `:=`(total, 
        rowSums(.SD[, 3:(length(.SD) - 1)]))]
    sum2 <- sum(na_count$total)
    removed <- sum1 - sum2
    # worst-case NA removal achievable by dropping one match instead
    rowremposs <- max(wide_table[, .N, by = matchId]$N) * (length(wide_table) - 
        2)
    print(paste("The removed number of NA entries is", removed, 
        "deleting a match instead could remove a maximum of", 
        rowremposs, "NA values."))
    keepgoingfactor <- removed/rowremposs
    wide_table <- wide_table[, `:=`(c(bad_oddtypes), NULL)]
    return(filter.oddtypes(wide_table, keepgoingfactor))
}

# iteratively drop the sparsest odd-type columns; the initial kgf of
# 100 guarantees at least one pass (recursion stops when kgf < 20)
odd_details_wide_filtered <- filter.oddtypes(copy(odd_details_wide_noah), 
    100)

# PCA of one bookmaker's odds, plotted against a match outcome.
#
# wide_filtered_table: wide odds table (matchId, bookmaker, odd columns)
# match.tbl:           matches table holding the logical outcome columns
# bmname:              bookmaker whose rows are analysed
# condition:           outcome column name (e.g. "isover") colouring the plot
# suppress:            if TRUE, skip the scree plot and printed summaries
# returns: list(merged scores data.frame, princomp object)
pca.analysis <- function(wide_filtered_table, match.tbl, bmname, 
    condition, suppress = FALSE) {
    # one bookmaker's rows, complete cases only, bookmaker column dropped
    df <- as.data.frame(na.omit(copy(wide_filtered_table)[bookmaker == 
        bmname][, `:=`(bookmaker, NULL)]))
    # scale each odd column by its maximum (no centering) so features
    # share a comparable range before PCA
    df[, -1] <- lapply(df[, -1], function(x) scale(x, center = FALSE, 
        scale = max(x)))
    rownames(df) <- df[["matchId"]]
    pca <- princomp(df[, -1])
    if (!suppress) {
        plot(pca, main = paste("PCA for", bmname))
        print(paste("PCA for", bmname))
        print(summary(pca))
        print(pca$loadings)
    }
    mscores <- as.data.frame(pca$scores)
    mscores[["matchId"]] <- rownames(mscores)
    # attach the outcome column (..condition is data.table's "variable
    # from the calling scope" syntax)
    pca_m <- unique(merge(mscores, match.tbl[, c("matchId", ..condition)], 
        by = "matchId"))
    # first two principal components, outcome == TRUE in red
    plot(pca_m[["Comp.1"]], pca_m[["Comp.2"]], col = ifelse(pca_m[[condition]], 
        "red", "black"), xlab = "Comp. 1", ylab = "Comp. 2", 
        main = paste("PCA shifted data for", bmname, "and", condition))
    legend("bottomright", c(condition, paste("NOT", condition)), 
        fill = c("red", "black"), cex = 0.75)
    return(list(pca_m, pca))
}

# Classical MDS of one bookmaker's odds, plotted against a match outcome.
#
# wide_filtered_table: wide odds table (matchId, bookmaker, odd columns)
# match.tbl:           matches table holding the logical outcome columns
# mthd:                distance method passed to dist(), e.g. "euclidean"
# bmname:              bookmaker whose rows are analysed
# condition:           outcome column name colouring the plot
# Called for its plot side effect; returns NULL invisibly.
do.MDS <- function(wide_filtered_table, match.tbl, mthd, bmname, 
    condition) {
    # same per-bookmaker subset and max-scaling as pca.analysis, so both
    # analyses see the same feature space
    df <- as.data.frame(na.omit(copy(wide_filtered_table)[bookmaker == 
        bmname][, `:=`(bookmaker, NULL)]))
    df[, -1] <- lapply(df[, -1], function(x) scale(x, center = FALSE, 
        scale = max(x)))
    rownames(df) <- df[["matchId"]]
    d <- dist(df[, -1], method = mthd)
    # classical (metric) MDS into the default 2 dimensions
    mds <- as.data.frame(cmdscale(d))
    mds[["matchId"]] <- rownames(mds)
    mds_m <- unique(merge(mds, match.tbl[, c("matchId", ..condition)], 
        by = "matchId"))
    plot(mds_m[["V1"]], mds_m[["V2"]], main = paste(bmname, mthd, 
        "Distance MDS for", condition), xlab = "Dim1", ylab = "Dim2", 
        col = ifelse(mds_m[[condition]], "red", "black"))
    legend("bottomright", c(condition, paste("NOT", condition)), 
        fill = c("red", "black"), cex = 0.75)
    invisible(NULL)
}

# PCA per bookmaker, coloured by the over/under outcome
pca1xB_ou <- pca.analysis(odd_details_wide_filtered, matches, 
    "1xBet", "isover")
pcacom_ou <- pca.analysis(odd_details_wide_filtered, matches, 
    "ComeOn", "isover")
pcayou_ou <- pca.analysis(odd_details_wide_filtered, matches, 
    "youwin", "isover")
pcaBet_ou <- pca.analysis(odd_details_wide_filtered, matches, 
    "Betfair", "isover")
pcaBEx_ou <- pca.analysis(odd_details_wide_filtered, matches, 
    "Betfair Exchange", "isover")

# PCA per bookmaker, coloured by home-team win (Task 2)
pca1xB_1 <- pca.analysis(odd_details_wide_filtered, matches, 
    "1xBet", "is1")
pcacom_1 <- pca.analysis(odd_details_wide_filtered, matches, 
    "ComeOn", "is1")
pcayou_1 <- pca.analysis(odd_details_wide_filtered, matches, 
    "youwin", "is1")
pcaBet_1 <- pca.analysis(odd_details_wide_filtered, matches, 
    "Betfair", "is1")
pcaBEx_1 <- pca.analysis(odd_details_wide_filtered, matches, 
    "Betfair Exchange", "is1")

# PCA per bookmaker, coloured by away-team win
pca1xB_2 <- pca.analysis(odd_details_wide_filtered, matches, 
    "1xBet", "is2")
pcacom_2 <- pca.analysis(odd_details_wide_filtered, matches, 
    "ComeOn", "is2")
pcayou_2 <- pca.analysis(odd_details_wide_filtered, matches, 
    "youwin", "is2")
pcaBet_2 <- pca.analysis(odd_details_wide_filtered, matches, 
    "Betfair", "is2")
pcaBEx_2 <- pca.analysis(odd_details_wide_filtered, matches, 
    "Betfair Exchange", "is2")

# PCA per bookmaker, coloured by draw
pca1xB_X <- pca.analysis(odd_details_wide_filtered, matches, 
    "1xBet", "isX")
pcacom_X <- pca.analysis(odd_details_wide_filtered, matches, 
    "ComeOn", "isX")
pcayou_X <- pca.analysis(odd_details_wide_filtered, matches, 
    "youwin", "isX")
pcaBet_X <- pca.analysis(odd_details_wide_filtered, matches, 
    "Betfair", "isX")
pcaBEx_X <- pca.analysis(odd_details_wide_filtered, matches, 
    "Betfair Exchange", "isX")

# MDS per bookmaker for the over/under outcome, with Euclidean and then
# Manhattan distances. The original spelled out all ten calls; looping
# produces the identical sequence of plots without the duplication
# (do.MDS assigns nothing, so no named results are lost).
mds_bookmakers <- c("1xBet", "ComeOn", "youwin", "Betfair", 
    "Betfair Exchange")
for (mds_method in c("euclidean", "manhattan")) {
    for (bm in mds_bookmakers) {
        do.MDS(odd_details_wide_filtered, matches, mds_method, 
            bm, "isover")
    }
}

Appendix B - Code For Task 3

# Dependencies: jpeg for image IO, data.table for the score-to-pixel
# mapping. library() errors immediately on a missing package, unlike
# require().
library(jpeg)
library(data.table)

# NOTE(review): hard-coded setwd() makes the script machine-specific;
# kept so the original run stays reproducible.
setwd("./Desktop/okul/Data Mining/data/")
# NOTE(review): `image` shadows graphics::image(); later calls such as
# image(small_img1, ...) still reach the function because R skips
# non-function bindings when resolving a call, but a name like `img`
# would be clearer.
image <- readJPEG("RGBclumsy.jpg")
str(image)
dim(image)

# render the full image and each colour channel separately
plot(c(0, 1), c(0, 1), type = "n", xlab = "", ylab = "")
rasterImage(image, 0, 0, 1, 1)
rasterImage(image = image[, , 1], 0, 0, 1, 1)
rasterImage(image = image[, , 2], 0, 0, 1, 1)
rasterImage(image = image[, , 3], 0, 0, 1, 1)

# Noisy full-size image: jitter every channel, clamp back into [0, 1],
# then render the composite and each channel.
image_noisy <- jitter(image, amount = 0.1)
image_noisy <- pmin(pmax(image_noisy, 0), 1)
plot(c(0, 1), c(0, 1), type = "n", xlab = "", ylab = "")
rasterImage(image_noisy, 0, 0, 1, 1)
rasterImage(image_noisy[, , 1], 0, 0, 1, 1)
rasterImage(image_noisy[, , 2], 0, 0, 1, 1)
rasterImage(image_noisy[, , 3], 0, 0, 1, 1)

# greyscale = per-pixel mean of the three colour channels
image_noisy_greyscale <- (image_noisy[, , 1] + 
    image_noisy[, , 2] + image_noisy[, , 3])/3
plot(c(0, 1), c(0, 1), type = "n", xlab = "", ylab = "")
rasterImage(image_noisy_greyscale, 0, 0, 1, 1)

# Smaller image: same noise + clamp + greyscale pipeline; this is the
# input used for the patch-level PCA below.
image_small <- readJPEG("RGBclumsysmall.jpg")
str(image_small)
dim(image_small)
image_noisy_small <- jitter(image_small, amount = 0.1)
image_noisy_small <- pmin(pmax(image_noisy_small, 0), 1)
image_noisy_greyscale_small <- (image_noisy_small[, , 1] + 
    image_noisy_small[, , 2] + image_noisy_small[, , 3])/3

# --- 3x3 patch extraction -------------------------------------------------
# Slide a patchdim x patchdim window over every interior pixel of the
# greyscale image; each window becomes one column of `patches` (9 rows,
# one per within-patch position, column-major), named "<i> <j>" after
# its centre pixel.
# NOTE(review): the column count uses ncol() twice, assuming a square
# image -- confirm for non-square inputs.
patchdim <- 3
gap <- ((patchdim - 1)/2)
patches <- matrix(nrow = patchdim^2, ncol = ((ncol(image_noisy_greyscale_small) - 
    (2 * gap))^2))
colnames(patches) <- rep("noname", times = ncol(patches))
k <- 1
kmax <- ncol(patches)
for (i in (gap + 1):(nrow(image_noisy_greyscale_small) - gap)) {
    for (j in (gap + 1):(ncol(image_noisy_greyscale_small) - 
        gap)) {
        patch <- image_noisy_greyscale_small[(i - gap):(i + gap), 
            (j - gap):(j + gap)]
        patches[, k] <- c(patch)
        colnames(patches)[k] <- paste(i, j)
        k <- k + 1
        # progress: report sparsely -- the original printed on every
        # iteration, emitting ~64k lines for a 256x256 image
        if (k %% 10000 == 0) print(kmax - k)
    }
}

# One patch per row for princomp; columns are named by within-patch
# position ("11" = top-left ... "33" = bottom-right, column-major).
position_names <- c("11", "21", "31", "12", "22", "32", 
    "13", "23", "33")
patches_t <- t(patches)
colnames(patches_t) <- position_names
pca_patch <- princomp(patches_t)
plot(pca_patch)
summary(pca_patch)
pca_patch$loadings

# Map each patch's PCA scores back to its centre-pixel coordinates:
# the score row names are the "<i> <j>" strings assigned during patch
# extraction; split them into numeric xpos/ypos columns.
mapping <- (as.data.table(pca_patch$scores)[, `:=`(pos, rownames(pca_patch$scores))][, 
    `:=`(c("xpos", "ypos"), tstrsplit(pos, " "))][, `:=`(c("xpos", 
    "ypos"), list(as.numeric(xpos), as.numeric(ypos)))][, `:=`(pos, 
    NULL)])

# --- Reconstruction from single components --------------------------------
# Build an image whose pixel (i, j) is the score of component `comp`
# for the patch centred at (i + gap, j + gap), clamped to [0, 1] for
# display. The original filled each pixel with a separate data.table
# filter (a full table scan per pixel, O(pixels^2) overall) and
# duplicated that loop three times; a single vectorized matrix fill
# does the same work at once.
reconstruct.component <- function(score_map, comp, side, gap) {
    img <- matrix(NA_real_, nrow = side, ncol = side)
    # xpos/ypos run from gap+1 to side+gap, so subtracting gap yields
    # 1-based image coordinates (the original hard-coded the +1 offset,
    # valid only for 3x3 patches)
    img[cbind(score_map$xpos - gap, score_map$ypos - gap)] <- score_map[[comp]]
    pmin(pmax(img, 0), 1)
}
side <- ncol(image_noisy_greyscale_small) - (2 * gap)

new_img1 <- reconstruct.component(mapping, "Comp.1", side, gap)
plot(c(0, 1), c(0, 1), type = "n", xlab = "", ylab = "")
rasterImage(new_img1, 0, 0, 1, 1)

new_img2 <- reconstruct.component(mapping, "Comp.2", side, gap)
plot(c(0, 1), c(0, 1), type = "n", xlab = "", ylab = "")
rasterImage(new_img2, 0, 0, 1, 1)

new_img3 <- reconstruct.component(mapping, "Comp.3", side, gap)
plot(c(0, 1), c(0, 1), type = "n", xlab = "", ylab = "")
rasterImage(new_img3, 0, 0, 1, 1)

# Render the first three loadings vectors as 3x3 grey images (the 3x3
# "filters" the PCA found), clamped to [0, 1] for display. Note that
# clamping zeroes any negative loading entries.
small_img1 <- matrix(pca_patch$loadings[, 1], 3, 3)
small_img1 <- pmin(pmax(small_img1, 0), 1)
image(small_img1, col = gray((0:255)/255))

small_img2 <- matrix(pca_patch$loadings[, 2], 3, 3)
small_img2 <- pmin(pmax(small_img2, 0), 1)
image(small_img2, col = gray((0:255)/255))

small_img3 <- matrix(pca_patch$loadings[, 3], 3, 3)
small_img3 <- pmin(pmax(small_img3, 0), 1)
image(small_img3, col = gray((0:255)/255))